{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8UYNvAExBmKR"
      },
      "source": [
        "##### Copyright 2021 Google LLC. Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "\n",
        "# Open Buildings - spatial analysis examples\n",
        "\n",
        "This notebook demonstrates some analysis methods with [Open Buildings](https://sites.research.google/open-buildings/) data: \n",
        "\n",
        "* Generating heatmaps of building density and size.\n",
        "* A simple analysis of accessibility to health facilities."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0eXL156ae-iT"
      },
      "outputs": [],
      "source": [
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "# https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LbGgdE4mj1hd"
      },
      "source": [
        "### Download buildings data for a region in Africa [takes up to 15 minutes for large countries]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "qP6ADuzRdZTF"
      },
      "outputs": [],
      "source": [
        "#@markdown Select a region from either the [Natural Earth low res](https://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-admin-0-countries/) (fastest), [Natural Earth high res](https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/) or [World Bank high res](https://datacatalog.worldbank.org/dataset/world-bank-official-boundaries) shapefiles:\n",
        "region_border_source = 'Natural Earth (Low Res 110m)'  #@param [\"Natural Earth (Low Res 110m)\", \"Natural Earth (High Res 10m)\", \"World Bank (High Res 10m)\"]\n",
        "region = 'GHA (Ghana)'  #@param [\"\", \"AGO (Angola)\", \"BDI (Burundi)\", \"BEN (Benin)\", \"BFA (Burkina Faso)\", \"BGD (Bangladesh)\", \"BRN (Brunei)\", \"BTN (Bhutan)\", \"BWA (Botswana)\", \"CAF (Central African Republic)\", \"CIV (C\\u00f4te d'Ivoire)\", \"CMR (Cameroon)\", \"COD (Democratic Republic of the Congo)\", \"COG (Republic of the Congo)\", \"COM (Comoros)\", \"CPV (Cape Verde)\", \"DJI (Djibouti)\", \"DZA (Algeria)\", \"EGY (Egypt)\", \"ERI (Eritrea)\", \"ETH (Ethiopia)\", \"GAB (Gabon)\", \"GHA (Ghana)\", \"GIN (Guinea)\", \"GMB (The Gambia)\", \"GNB (Guinea-Bissau)\", \"GNQ (Equatorial Guinea)\", \"IDN (Indonesia)\", \"IOT (British Indian Ocean Territory)\", \"KEN (Kenya)\", \"KHM (Cambodia)\", \"LAO (Laos)\", \"LBR (Liberia)\", \"LKA (Sri Lanka)\", \"LSO (Lesotho)\", \"MDG (Madagascar)\", \"MDV (Maldives)\", \"MOZ (Mozambique)\", \"MRT (Mauritania)\", \"MUS (Mauritius)\", \"MWI (Malawi)\", \"MYS (Malaysia)\", \"MYT (Mayotte)\", \"NAM (Namibia)\", \"NER (Niger)\", \"NGA (Nigeria)\", \"NPL (Nepal)\", \"PHL (Philippines)\", \"REU (R\\u00e9union)\", \"RWA (Rwanda)\", \"SDN (Sudan)\", \"SEN (Senegal)\", \"SGP (Singapore)\", \"SHN (Saint Helena, Ascension and Tristan da Cunha)\", \"SLE (Sierra Leone)\", \"SOM (Somalia)\", \"STP (S\\u00e3o Tom\\u00e9 and Pr\\u00edncipe)\", \"SWZ (Eswatini)\", \"SYC (Seychelles)\", \"TGO (Togo)\", \"THA (Thailand)\", \"TLS (Timor-Leste)\", \"TUN (Tunisia)\", \"TZA (Tanzania)\", \"UGA (Uganda)\", \"VNM (Vietnam)\", \"ZAF (South Africa)\", \"ZMB (Zambia)\", \"ZWE (Zimbabwe)\"]\n",
        "\n",
        "#@markdown Alternatively, specify an area of interest in [WKT format](https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry) (assumes crs='EPSG:4326'); this [tool](https://arthur-e.github.io/Wicket/sandbox-gmaps3.html) might be useful.\n",
        "your_own_wkt_polygon = ''  #@param {type:\"string\"}\n",
        "#@markdown Select type of data to download here:\n",
        "data_type = 'points'  #@param [\"polygons\", \"points\"]\n",
        "#@markdown In this analysis we only need centroids of buildings, therefore by default we use the data in the *point* format (files in *polygons* format are larger). Refer to [Open Buildings Data Format](https://sites.research.google/open-buildings/#dataformat) for more details.\n",
        "!sudo apt-get install swig\n",
        "!pip install s2geometry pygeos geopandas\n",
        "\n",
        "import functools\n",
        "import glob\n",
        "import gzip\n",
        "import multiprocessing\n",
        "import os\n",
        "import shutil\n",
        "import tempfile\n",
        "from typing import List, Optional, Tuple\n",
        "\n",
        "\n",
        "import geopandas as gpd\n",
        "from google.colab import files\n",
        "from IPython import display\n",
        "from mpl_toolkits.axes_grid1 import make_axes_locatable\n",
        "from osgeo import gdal\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import s2geometry as s2\n",
        "import shapely\n",
        "import tensorflow as tf\n",
        "import tqdm.notebook\n",
        "\n",
        "BUILDING_DOWNLOAD_PATH = ('gs://open-buildings-data/v3/'\n",
        "                          f'{data_type}_s2_level_6_gzip_no_header')\n",
        "\n",
        "def get_filename_and_region_dataframe(\n",
        "    region_border_source: str, region: str,\n",
        "    your_own_wkt_polygon: str) -\u003e Tuple[str, gpd.geodataframe.GeoDataFrame]:\n",
        "  \"\"\"Returns output filename and a geopandas dataframe with one region row.\"\"\"\n",
        "\n",
        "  if your_own_wkt_polygon:\n",
        "    filename = f'open_buildings_v3_{data_type}_your_own_wkt_polygon.csv.gz'\n",
        "    region_df = gpd.GeoDataFrame(\n",
        "        geometry=gpd.GeoSeries.from_wkt([your_own_wkt_polygon]),\n",
        "        crs='EPSG:4326')\n",
        "    if not isinstance(region_df.iloc[0].geometry,\n",
        "                      shapely.geometry.polygon.Polygon) and not isinstance(\n",
        "                          region_df.iloc[0].geometry,\n",
        "                          shapely.geometry.multipolygon.MultiPolygon):\n",
        "      raise ValueError(\"`your_own_wkt_polygon` must be a POLYGON or \"\n",
        "                      \"MULTIPOLYGON.\")\n",
        "    print(f'Preparing your_own_wkt_polygon.')\n",
        "    return filename, region_df\n",
        "\n",
        "  if not region:\n",
        "    raise ValueError('Please select a region or set your_own_wkt_polygon.')\n",
        "\n",
        "  if region_border_source == 'Natural Earth (Low Res 110m)':\n",
        "    url = ('https://www.naturalearthdata.com/http//www.naturalearthdata.com/'\n",
        "           'download/110m/cultural/ne_110m_admin_0_countries.zip')\n",
        "    !wget -N {url}\n",
        "    display.clear_output()\n",
        "    region_shapefile_path = os.path.basename(url)\n",
        "    source_name = 'ne_110m'\n",
        "  elif region_border_source == 'Natural Earth (High Res 10m)':\n",
        "    url = ('https://www.naturalearthdata.com/http//www.naturalearthdata.com/'\n",
        "           'download/10m/cultural/ne_10m_admin_0_countries.zip')\n",
        "    !wget -N {url}\n",
        "    display.clear_output()\n",
        "    region_shapefile_path = os.path.basename(url)\n",
        "    source_name = 'ne_10m'\n",
        "  elif region_border_source == 'World Bank (High Res 10m)':\n",
        "    url = ('https://development-data-hub-s3-public.s3.amazonaws.com/ddhfiles/'\n",
        "           '779551/wb_countries_admin0_10m.zip')\n",
        "    !wget -N {url}\n",
        "    !unzip -o {os.path.basename(url)}\n",
        "    display.clear_output()\n",
        "    region_shapefile_path = 'WB_countries_Admin0_10m'\n",
        "    source_name = 'wb_10m'\n",
        "\n",
        "  region_iso_a3 = region.split(' ')[0]\n",
        "  filename = (f'open_buildings_v3_{data_type}_'\n",
        "              f'{source_name}_{region_iso_a3}.csv.gz')\n",
        "  region_df = gpd.read_file(region_shapefile_path).query(\n",
        "      f'ISO_A3 == \"{region_iso_a3}\"').dissolve(by='ISO_A3')[['geometry']]\n",
        "  print(f'Preparing {region} from {region_border_source}.')\n",
        "  return filename, region_df\n",
        "\n",
        "\n",
        "def get_bounding_box_s2_covering_tokens(\n",
        "    region_geometry: shapely.geometry.base.BaseGeometry) -\u003e List[str]:\n",
        "  region_bounds = region_geometry.bounds\n",
        "  s2_lat_lng_rect = s2.S2LatLngRect_FromPointPair(\n",
        "      s2.S2LatLng_FromDegrees(region_bounds[1], region_bounds[0]),\n",
        "      s2.S2LatLng_FromDegrees(region_bounds[3], region_bounds[2]))\n",
        "  coverer = s2.S2RegionCoverer()\n",
        "  # NOTE: Should be kept in-sync with s2 level in BUILDING_DOWNLOAD_PATH.\n",
        "  coverer.set_fixed_level(6)\n",
        "  coverer.set_max_cells(1000000)\n",
        "  return [cell.ToToken() for cell in coverer.GetCovering(s2_lat_lng_rect)]\n",
        "\n",
        "\n",
        "def s2_token_to_shapely_polygon(\n",
        "    s2_token: str) -\u003e shapely.geometry.polygon.Polygon:\n",
        "  s2_cell = s2.S2Cell(s2.S2CellId_FromToken(s2_token, len(s2_token)))\n",
        "  coords = []\n",
        "  for i in range(4):\n",
        "    s2_lat_lng = s2.S2LatLng(s2_cell.GetVertex(i))\n",
        "    coords.append((s2_lat_lng.lng().degrees(), s2_lat_lng.lat().degrees()))\n",
        "  return shapely.geometry.Polygon(coords)\n",
        "\n",
        "\n",
        "def download_s2_token(\n",
        "    s2_token: str, region_df: gpd.geodataframe.GeoDataFrame) -\u003e Optional[str]:\n",
        "  \"\"\"Downloads the matching CSV file with polygons for the `s2_token`.\n",
        "\n",
        "  NOTE: Only polygons inside the region are kept.\n",
        "  NOTE: Passing output via a temporary file to reduce memory usage.\n",
        "\n",
        "  Args:\n",
        "    s2_token: S2 token for which to download the CSV file with building\n",
        "      polygons. The S2 token should be at the same level as the files in\n",
        "      BUILDING_DOWNLOAD_PATH.\n",
        "    region_df: A geopandas dataframe with only one row that contains the region\n",
        "      for which to keep polygons.\n",
        "\n",
        "  Returns:\n",
        "    Either filepath which contains a gzipped CSV without header for the\n",
        "    `s2_token` subfiltered to only contain building polygons inside the region\n",
        "    or None which means that there were no polygons inside the region for this\n",
        "    `s2_token`.\n",
        "  \"\"\"\n",
        "  s2_cell_geometry = s2_token_to_shapely_polygon(s2_token)\n",
        "  region_geometry = region_df.iloc[0].geometry\n",
        "  prepared_region_geometry = shapely.prepared.prep(region_geometry)\n",
        "  # If the s2 cell doesn't intersect the country geometry at all then we can\n",
        "  # know that all rows would be dropped so instead we can just return early.\n",
        "  if not prepared_region_geometry.intersects(s2_cell_geometry):\n",
        "    return None\n",
        "  try:\n",
        "    # Using tf.io.gfile.GFile gives better performance than passing the GCS path\n",
        "    # directly to pd.read_csv.\n",
        "    with tf.io.gfile.GFile(\n",
        "        os.path.join(BUILDING_DOWNLOAD_PATH, f'{s2_token}_buildings.csv.gz'),\n",
        "        'rb') as gf:\n",
        "      # If the s2 cell is fully covered by country geometry then can skip\n",
        "      # filtering as we need all rows.\n",
        "      if prepared_region_geometry.covers(s2_cell_geometry):\n",
        "        with tempfile.NamedTemporaryFile(mode='w+b', delete=False) as tmp_f:\n",
        "          shutil.copyfileobj(gf, tmp_f)\n",
        "          return tmp_f.name\n",
        "      # Else take the slow path.\n",
        "      # NOTE: We read in chunks to save memory.\n",
        "      csv_chunks = pd.read_csv(\n",
        "          gf, chunksize=2000000, dtype=object, compression='gzip', header=None)\n",
        "      tmp_f = tempfile.NamedTemporaryFile(mode='w+b', delete=False)\n",
        "      tmp_f.close()\n",
        "      for csv_chunk in csv_chunks:\n",
        "        points = gpd.GeoDataFrame(\n",
        "            geometry=gpd.points_from_xy(csv_chunk[1], csv_chunk[0]),\n",
        "            crs='EPSG:4326')\n",
        "        # sjoin 'within' was faster than using shapely's 'within' directly.\n",
        "        points = gpd.sjoin(points, region_df, predicate='within')\n",
        "        csv_chunk = csv_chunk.iloc[points.index]\n",
        "        csv_chunk.to_csv(\n",
        "            tmp_f.name,\n",
        "            mode='ab',\n",
        "            index=False,\n",
        "            header=False,\n",
        "            compression={\n",
        "                'method': 'gzip',\n",
        "                'compresslevel': 1\n",
        "            })\n",
        "      return tmp_f.name\n",
        "  except tf.errors.NotFoundError:\n",
        "    return None\n",
        "\n",
        "# Clear output after pip install.\n",
        "display.clear_output()\n",
        "filename, region_df = get_filename_and_region_dataframe(region_border_source,\n",
        "                                                        region,\n",
        "                                                        your_own_wkt_polygon)\n",
        "# Remove any old outputs to not run out of disk.\n",
        "for f in glob.glob('/tmp/open_buildings_*'):\n",
        "  os.remove(f)\n",
        "# Write header to the compressed CSV file.\n",
        "with gzip.open(f'/tmp/{filename}', 'wt') as merged:\n",
        "  if data_type == \"polygons\":\n",
        "    merged.write(','.join([\n",
        "        'latitude', 'longitude', 'area_in_meters', 'confidence', 'geometry',\n",
        "        'full_plus_code'\n",
        "    ]) + '\\n')\n",
        "  else:\n",
        "    merged.write(','.join([\n",
        "        'latitude', 'longitude', 'area_in_meters', 'confidence',\n",
        "        'full_plus_code'\n",
        "    ]) + '\\n')\n",
        "download_s2_token_fn = functools.partial(download_s2_token, region_df=region_df)\n",
        "s2_tokens = get_bounding_box_s2_covering_tokens(region_df.iloc[0].geometry)\n",
        "# Downloads CSV files for relevant S2 tokens and after filtering appends them\n",
        "# to the compressed output CSV file. Relies on the fact that concatenating\n",
        "# gzipped files produces a valid gzip file.\n",
        "# NOTE: Uses a pool to speed up output preparation.\n",
        "with open(f'/tmp/{filename}', 'ab') as merged:\n",
        "  with multiprocessing.Pool(4) as e:\n",
        "    for fname in tqdm.notebook.tqdm(\n",
        "        e.imap_unordered(download_s2_token_fn, s2_tokens),\n",
        "        total=len(s2_tokens)):\n",
        "      if fname:\n",
        "        with open(fname, 'rb') as tmp_f:\n",
        "          shutil.copyfileobj(tmp_f, merged)\n",
        "        os.unlink(fname)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UY9Ba1Cxpcb4"
      },
      "source": [
        "# Visualise the data\n",
        "\n",
        "First we convert the CSV file into a GeoDataFrame. The CSV files can be quite large when they include the polygon outline of every building, that is, when the data type downloaded is polygons. For this example we only need longitude and latitude, so we only process those columns to save memory."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hSpb1JKVjuYj"
      },
      "outputs": [],
      "source": [
        "buildings = pd.read_csv(\n",
        "    f\"/tmp/{filename}\", engine=\"c\",\n",
        "    usecols=['latitude', 'longitude', 'area_in_meters', 'confidence'])\n",
        "\n",
        "print(f\"Read {len(buildings):,} records.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7yi3IAq2fk-6"
      },
      "source": [
        "For some countries there can be tens of millions of buildings, so we also take a random sample for doing plots."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "J4YmtqUyf4sU"
      },
      "outputs": [],
      "source": [
        "sample_size = 200000  #@param"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dRGBn87afzRu"
      },
      "outputs": [],
      "source": [
        "buildings_sample = (buildings.sample(sample_size)\n",
        "                    if len(buildings) \u003e sample_size else buildings)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ge4EbCVTj5Vk"
      },
      "outputs": [],
      "source": [
        "plt.plot(buildings_sample.longitude, buildings_sample.latitude, 'k.',\n",
        "         alpha=0.25, markersize=0.5)\n",
        "plt.gcf().set_size_inches(10, 10)\n",
        "plt.xlabel('Longitude')\n",
        "plt.ylabel('Latitude')\n",
        "plt.axis('equal');"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0Vpk1lJteYyx"
      },
      "source": [
        "# Prepare the data for mapping building statistics\n",
        "\n",
        "Set up a grid, which we will use to calculate statistics about buildings.\n",
        "\n",
        "We also want to select the examples most likely to be buildings, using a threshold on the confidence score."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "I8yao05wYTba"
      },
      "outputs": [],
      "source": [
        "max_grid_dimension = 1000 #@param\n",
        "confidence_threshold = 0.75 #@param"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tb6AM8KQCZX-"
      },
      "outputs": [],
      "source": [
        "buildings = buildings.query(f\"confidence \u003e {confidence_threshold}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "24_FjZ8yrNjj"
      },
      "outputs": [],
      "source": [
        "# Create a grid covering the dataset bounds\n",
        "min_lon = buildings.longitude.min() \n",
        "max_lon = buildings.longitude.max() \n",
        "min_lat = buildings.latitude.min()\n",
        "max_lat = buildings.latitude.max() \n",
        "\n",
        "grid_density_degrees = (max(max_lon - min_lon, max_lat - min_lat)\n",
        "                        / max_grid_dimension)\n",
        "\n",
        "bounds = [min_lon, min_lat, max_lon, max_lat]\n",
        "xcoords = np.arange(min_lon, max_lon, grid_density_degrees)\n",
        "ycoords = np.arange(max_lat, min_lat, -grid_density_degrees)\n",
        "xv, yv = np.meshgrid(xcoords, ycoords)\n",
        "xy = np.stack([xv.ravel(), yv.ravel()]).transpose()\n",
        "\n",
        "print(f\"Calculated grid of size {xv.shape[0]} x {xv.shape[1]}.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ophg9IrprQ0Y"
      },
      "source": [
        "To calculate statistics, we need a function to convert between (longitude, latitude) coordinates in the world and (x, y) coordinates in the grid."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qE_5xKBpbMcs"
      },
      "outputs": [],
      "source": [
        "geotransform = (min_lon, grid_density_degrees, 0,\n",
        "                max_lat, 0, -grid_density_degrees)\n",
        "\n",
        "def lonlat_to_xy(lon, lat, geotransform):\n",
        "    x = int((lon - geotransform[0])/geotransform[1])\n",
        "    y = int((lat - geotransform[3])/geotransform[5])\n",
        "    return x,y"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "R0-QQNjZneNH"
      },
      "source": [
        "Now we can count how many buildings there are on each cell of the grid. "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "czf5TZicWeGL"
      },
      "outputs": [],
      "source": [
        "counts = np.zeros(xv.shape)\n",
        "area_totals = np.zeros(xv.shape)\n",
        "\n",
        "for lat, lon, area in tqdm.notebook.tqdm(\n",
        "    zip(buildings.latitude, buildings.longitude, buildings.area_in_meters)):\n",
        "  x, y = lonlat_to_xy(lon, lat, geotransform)\n",
        "  if x \u003e= 0 and y \u003e= 0 and x \u003c len(xcoords) and y \u003c len(ycoords):\n",
        "    counts[y, x] += 1\n",
        "    area_totals[y, x] += area\n",
        "\n",
        "area_totals[counts == 0] = np.nan\n",
        "counts[counts == 0] = np.nan\n",
        "mean_area = area_totals / counts"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_gZeJwyhm7lM"
      },
      "source": [
        "# Plot the counts of buildings\n",
        "\n",
        "Knowing the counts of buildings is useful for example in planning service delivery, estimating population or designing census enumeration areas."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VrMupotmWeBV"
      },
      "outputs": [],
      "source": [
        "plt.imshow(np.log10(np.nan_to_num(counts) + 1.), cmap=\"viridis\")\n",
        "plt.gcf().set_size_inches(15, 15)\n",
        "cbar = plt.colorbar(shrink=0.5)\n",
        "cbar.ax.set_yticklabels([f'{x:.0f}' for x in 10 ** cbar.ax.get_yticks()]) \n",
        "plt.title(\"Building counts per grid cell\");"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cFMRgXr93k0G"
      },
      "source": [
        "## [optional] Export a GeoTIFF file\n",
        "\n",
        "This can be useful to carry our further analysis with software such as [QGIS](https://qgis.org/)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "jmTlPdc73vzB"
      },
      "outputs": [],
      "source": [
        "def save_geotiff(filename, values, geotransform):\n",
        "  driver = gdal.GetDriverByName(\"GTiff\")\n",
        "  dataset = driver.Create(filename, values.shape[1], values.shape[0], 1,\n",
        "                          gdal.GDT_Float32)\n",
        "  dataset.SetGeoTransform(geotransform)\n",
        "  band = dataset.GetRasterBand(1)\n",
        "  band.WriteArray(values)\n",
        "  band.SetNoDataValue(-1)\n",
        "  dataset.FlushCache()\n",
        "\n",
        "filename = \"building_counts.tiff\"\n",
        "save_geotiff(filename, counts, geotransform)\n",
        "files.download(filename)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "O-0pi0uxm-_c"
      },
      "source": [
        "# Generate a map of building sizes\n",
        "\n",
        "Knowing average building sizes is useful too -- it is linked, for example, to how much economic activity there is in each area."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BiZGeEqPnBPo"
      },
      "outputs": [],
      "source": [
        "# Only calculate the mean building size for grid locations with at\n",
        "# least a few buildings, so that we get more reliable averages.\n",
        "mean_area_filtered = mean_area.copy()\n",
        "mean_area_filtered[counts \u003c 10] = 0\n",
        "\n",
        "# Set a maximum value for the colour scale, to make the plot brighter.\n",
        "plt.imshow(np.nan_to_num(mean_area_filtered), vmax=250, cmap=\"viridis\")\n",
        "plt.title(\"Mean building size (m$^2$)\")\n",
        "plt.colorbar(shrink=0.5, extend=\"max\")\n",
        "plt.gcf().set_size_inches(15, 15)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "G-jTJGounB3S"
      },
      "source": [
        "# Health facility accessibility\n",
        "\n",
        "We can combine different types of geospatial data to get various insights. If we have information on the locations of clinics and hospitals across Ghana, for example, then one interesting analysis is how accessible health services are in different places.\n",
        "\n",
        "In this example, we'll look at the average distance to the nearest health facility.  \n",
        "\n",
        "We use this [data](https://data.humdata.org/m/dataset/ghana-healthsites) made available by [Global Healthsites Mapping Project](https://healthsites.io/).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PeOLKjCaNSl2"
      },
      "outputs": [],
      "source": [
        "health_sites = pd.read_csv(\n",
        "    \"https://data.humdata.org/dataset/364c5aca-7cd7-4248-b394-335113293c7a/\"\n",
        "    \"resource/b7e55f34-9e3b-417f-b329-841cff6a9554/download/ghana.csv\")\n",
        "health_sites = gpd.GeoDataFrame(\n",
        "    health_sites, geometry=gpd.points_from_xy(health_sites.X, health_sites.Y))\n",
        "health_sites.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "d0r4lJCwhtCv"
      },
      "source": [
        "We drop all columns not relevant to the computation of mean distance from health facilities. We also exclude all rows with empty or NaN values, select amenities captured as hospitals in the new geodata and choose values within the range of our area of interest."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VlU_elaUJ1lT"
      },
      "outputs": [],
      "source": [
        "health_sites = health_sites[['X', 'Y', 'amenity', 'name', 'geometry']]\n",
        "health_sites.dropna(axis=0, inplace=True)\n",
        "health_sites = health_sites[health_sites['amenity'].isin(\n",
        "    ['hospital', 'clinic', 'health_post', 'doctors'])]\n",
        "health_sites = health_sites.query(f'Y \u003e {min_lat} and  Y \u003c {max_lat}'\n",
        "                                  f'and X \u003e {min_lon} and X \u003c {max_lon}')\n",
        "health_sites.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AgEFFmmbiA5D"
      },
      "source": [
        "Have a look at the locations of health facilities compared to the locations of buildings.\n",
        "\n",
        "*Note: this data may not be complete.*"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fdn1pXT1XAia"
      },
      "outputs": [],
      "source": [
        "plt.plot(buildings_sample.longitude,\n",
        "         buildings_sample.latitude,\n",
        "         'k.', alpha=0.25, markersize=0.5)\n",
        "plt.plot(health_sites.X, health_sites.Y, \n",
        "         marker='$\\\\oplus$', color= 'red', alpha = 0.8,\n",
        "         markersize=10, linestyle='None')\n",
        "plt.gcf().set_size_inches(10, 10)\n",
        "plt.xlabel('Longitude')\n",
        "plt.ylabel('Latitude')\n",
        "plt.legend(['Building', 'Health center'])\n",
        "plt.axis('equal');"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7leA8WZBhZX6"
      },
      "source": [
        "Next we calculate, for each building, the distance to the nearest health facility. We use the sample of the buildings data that we took earlier, so that the computations don't take too long."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CJRNAPpMgnqJ"
      },
      "outputs": [],
      "source": [
        "buildings_sample = gpd.GeoDataFrame(\n",
        "    buildings_sample,\n",
        "    geometry=gpd.points_from_xy(buildings_sample.longitude,\n",
        "                                buildings_sample.latitude))\n",
        "\n",
        "buildings_sample[\n",
        "    \"distance_to_nearest_health_facility\"] = buildings_sample.geometry.apply(\n",
        "        lambda g: health_sites.distance(g).min())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MF6-elrV0OTl"
      },
      "outputs": [],
      "source": [
        "buildings_sample.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QQDNRlYWzdQt"
      },
      "source": [
        "That has computed the distance in degrees (longitude and latitude), which is not very intuitive. We can convert this approximately to kilometers by  multiplying with the distance spanned by one degree at the equator."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EAM8m7A4zM2T"
      },
      "outputs": [],
      "source": [
        "buildings_sample[\"distance_to_nearest_health_facility\"] *= 111.32"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "D6D9K9LriYrt"
      },
      "source": [
        "Now we can then find the mean distance to the nearest health facility by administrative area. First, we load data on the shapes of adminstrative areas.\n",
        "\n",
        "We use this [data](https://data.humdata.org/m/dataset/ghana-administrative-boundaries) made available by [OCHA ROWCA](https://www.unocha.org/rowca) - United Nations Office for the Coordination of Humanitarian Affairs for West and Central Africa."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OF2BN9Cqgcn3"
      },
      "outputs": [],
      "source": [
        "!wget https://data.humdata.org/dataset/dc4c17cf-59d9-478c-b2b7-acd889241194/resource/4443ddba-eeaf-4367-9457-7820ea482f7f/download/gha_admbnda_gss_20210308_shp.zip\n",
        "!unzip gha_admbnda_gss_20210308_shp.zip\n",
        "display.clear_output()\n",
        "admin_areas = gpd.read_file(\"gha_admbnda_gss_20210308_SHP/gha_admbnda_adm2_gss_20210308.shp\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IwTTyxd5nK5z"
      },
      "source": [
        "Next, find the average distance to the nearest health facility within each area."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8VA0NeSznKHF"
      },
      "outputs": [],
      "source": [
        "# Both data frames have the same coordinate system.\n",
        "buildings_sample.crs = admin_areas.crs\n",
        "\n",
        "# Spatial join to find out which administrative area every building is in.\n",
        "points_polys = gpd.sjoin(buildings_sample, admin_areas, how=\"left\")\n",
        "\n",
        "# Aggregate by admin area to get the average distance to nearest health facility.\n",
        "stats = points_polys.groupby(\n",
        "    \"index_right\")[\"distance_to_nearest_health_facility\"].agg([\"mean\"])\n",
        "admin_areas_with_distances = gpd.GeoDataFrame(stats.join(admin_areas))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3XR7hfFJpn_Q"
      },
      "outputs": [],
      "source": [
        "admin_areas_with_distances.plot(\n",
        "    column=\"mean\", legend=True, legend_kwds={\"shrink\": 0.5})\n",
        "plt.title(\"Average distance to the nearest health facility (km)\")\n",
        "plt.gcf().set_size_inches(15, 15)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "open_buildings_spatial_analysis_examples.ipynb",
      "private_outputs": true,
      "provenance": [
        {
          "file_id": "1lVqavF0oWVYTpprD-Sgp654VD1-O86n4",
          "timestamp": 1637254081290
        }
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
